Data Exploration of H-MOG Dataset

Created by Holger Buech, Q1/2019

Description

Basic statistics & visualizations for the H-MOG Dataset, especially considering aggregation on sessions and subjects.

Purpose

  • Get basic understanding of dataset
  • Check for completeness
  • Inspect data distribution

Data Sources

Preparations

Imports

In [1]:
# Standard
from pathlib import Path
import os
import sys
import datetime

# Extra
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib.style as style
import seaborn as sns

# `DatasetLoader` is a custom helper class to retrieve data from hdf5 file
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.utility.dataset_loader_hdf5 import DatasetLoader
In [2]:
# Render matplotlib figures inline in the cell output
%matplotlib inline
# Execute the sibling utils notebook in this namespace. The utils_* helpers and
# the MAGENTA constant used below are not defined in this notebook, so they
# presumably come from there (TODO confirm).
%run utils.ipynb
utils_set_output_style()

Configuration

In [3]:
# Notebook-wide settings
SEED = 712  # Passed to the dataset loader and to numpy before random draws
TABLE_NAME = "sensors_100hz"  # Table with raw sensor data
HMOG_HDF5 = Path.cwd().parent.joinpath("data", "processed", "hmog_dataset.hdf5")

# Nine sensor columns, ordered sensor by sensor: acc_x .. acc_z, gyr_x .. gyr_z,
# mag_x .. mag_z
FEATURE_COLS = [
    f"{sensor}_{axis}"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]

# Figures for thesis; the directory is created if it does not exist yet
REPORT_PATH = Path.cwd().parent.joinpath("reports", "figures")
REPORT_PATH.mkdir(parents=True, exist_ok=True)
In [4]:
# Loader options: no subject limit and empty filter lists, i.e. presumably the
# complete dataset is loaded (filter semantics live in DatasetLoader - verify).
loader_options = dict(
    hdf5_file=HMOG_HDF5,
    table_name=TABLE_NAME,
    max_subjects=None,
    task_types=[],
    exclude_subjects=[],
    exclude_cols=[],
    seed=SEED,
)
hmog = DatasetLoader(**loader_options)
hmog.data_summary()

Out[4]:
DataFrame Memory (MB) Rows Columns Subjects Sessions
0 all 13239.87 123955466 13 100 2392
1 index 0.09 2392 4 100 2392

Exploration

Basics

In [5]:
# Column dtypes, row count and memory usage of the full sample table
hmog.all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 123955466 entries, 0 to 34105
Data columns (total 13 columns):
acc_x        float64
acc_y        float64
acc_z        float64
gyr_x        float64
gyr_y        float64
gyr_z        float64
mag_x        float64
mag_y        float64
mag_z        float64
sys_time     int64
subject      object
session      object
task_type    int64
dtypes: float64(9), int64(2), object(2)
memory usage: 12.9+ GB
In [6]:
# First rows of the raw sample table
hmog.all.head()
Out[6]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z mag_x mag_y mag_z sys_time subject session task_type
0 0.240019 7.783550 5.087678 0.023729 -0.087669 -0.298940 -17.868000 15.582000 -30.400000 1396226206090 100669 100669_session_1 0
1 0.230074 7.788090 5.080470 0.024885 -0.085446 -0.296594 -17.855675 15.556771 -30.374313 1396226206100 100669 100669_session_1 0
2 0.220130 7.792631 5.073262 0.026041 -0.083223 -0.294248 -17.843349 15.531542 -30.348626 1396226206110 100669 100669_session_1 0
3 0.210185 7.797171 5.066054 0.027196 -0.080999 -0.291902 -17.831024 15.506313 -30.322939 1396226206120 100669 100669_session_1 0
4 0.200241 7.801712 5.058845 0.028352 -0.078776 -0.289556 -17.818699 15.481084 -30.297253 1396226206130 100669 100669_session_1 0
In [7]:
# Summary statistics for numeric and non-numeric columns; with an empty
# percentile list the median (50%) is the only quantile row in the output
hmog.all.describe(include="all", percentiles=[])
Out[7]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z mag_x mag_y mag_z sys_time subject session task_type
count 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 1.239555e+08 123955466 123955466 1.239555e+08
unique NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 100 2392 NaN
top NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 256487 771782_session_5 NaN
freq NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2326559 224813 NaN
mean 1.754039e-01 4.671516e+00 7.644874e+00 1.182403e-02 1.010373e-02 7.436655e-03 -2.721895e+00 -4.509907e+00 -1.255342e+01 1.399221e+12 NaN NaN 3.363778e+00
std 1.866653e+00 2.162596e+00 1.365915e+00 1.709596e-01 2.086806e-01 2.072633e-01 1.545500e+01 1.610624e+01 2.728261e+01 1.814971e+09 NaN NaN 1.683130e+00
min -1.960911e+01 -1.958756e+01 -1.960851e+01 -9.725586e+00 -1.000842e+01 -9.998948e+00 -4.075450e+02 -1.315610e+02 -1.291682e+03 1.396222e+12 NaN NaN 0.000000e+00
50% -1.653495e-02 4.914099e+00 7.797258e+00 9.869839e-03 2.392533e-03 -1.269342e-02 -2.927727e+00 -4.447000e+00 -1.592020e+01 1.399418e+12 NaN NaN 3.000000e+00
max 1.960791e+01 1.814266e+01 1.960851e+01 7.338324e+00 1.004660e+01 8.924130e+00 2.086270e+02 4.701380e+02 4.869550e+02 1.402767e+12 NaN NaN 6.000000e+00
In [8]:
# Number of missing values per column
hmog.all.isna().sum()
Out[8]:
acc_x        0
acc_y        0
acc_z        0
gyr_x        0
gyr_y        0
gyr_z        0
mag_x        0
mag_y        0
mag_z        0
sys_time     0
subject      0
session      0
task_type    0
dtype: int64
In [9]:
# Cast subjects to categorical for plotting & memory saving
# (the column is loaded with dtype object, see the info() output above).
# Later cells rely on this, e.g. the `.cat` accessor in the partial boxplot cell.
hmog.all["subject"] = hmog.all["subject"].astype("category")

Interpretation:

  • All data types look correct
  • Ranges for sensor data seem reasonable
  • No missing values

Example Session

In [10]:
# Reproducibly pick one session id at random
session_ids = hmog.all["session"].unique()
np.random.seed(SEED)
session = np.random.choice(session_ids)
session
Out[10]:
'151985_session_7'
In [11]:
# Samples of the selected session, indexed by their timestamp
# (sys_time holds epoch milliseconds, converted to datetimes here)
df_single_session = (
    hmog.all.loc[hmog.all["session"] == session]
    .assign(**{"Session Time": lambda frame: pd.to_datetime(frame["sys_time"], unit="ms")})
    .set_index("Session Time")
)
df_single_session.head()
Out[11]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z mag_x mag_y mag_z sys_time subject session task_type
Session Time
2014-05-20 00:15:49.700 -0.130012 3.401216 8.967853 0.139888 0.557415 -0.054978 -28.317000 24.320000 -52.670000 1400544949700 151985 151985_session_7 4
2014-05-20 00:15:49.710 -0.141806 3.417622 8.956919 0.154025 0.542415 -0.041146 -28.248717 24.226383 -52.636666 1400544949710 151985 151985_session_7 4
2014-05-20 00:15:49.720 -0.153600 3.434027 8.945985 0.168162 0.527417 -0.027314 -28.180436 24.132769 -52.603333 1400544949720 151985 151985_session_7 4
2014-05-20 00:15:49.730 -0.165394 3.450432 8.935051 0.182299 0.512418 -0.013483 -28.112155 24.039155 -52.570000 1400544949730 151985 151985_session_7 4
2014-05-20 00:15:49.740 -0.177188 3.466838 8.924117 0.196437 0.497419 0.000349 -28.043872 23.945538 -52.536667 1400544949740 151985 151985_session_7 4
In [12]:
print("Session duration according to timestamps:")
# Span between the earliest and the latest timestamp of the session
session_times = df_single_session.index
session_times.max() - session_times.min()
Session duration according to timestamps:
Out[12]:
Timedelta('0 days 00:12:27.750000')
In [13]:
print("Session duration value count & frequency:")
# Number of samples divided by the 100 Hz sampling rate gives seconds
n_samples = len(df_single_session)
sec = datetime.timedelta(seconds=n_samples / 100)
str(sec)
Session duration value count & frequency:
Out[13]:
'0:12:27.760000'
In [14]:
# Report the first distinct task type that occurs in the session
first_task_type = df_single_session["task_type"].unique()[0]
print(f"Task Type: {int(first_task_type)}")
print("where\n[1, 3, 5] are sitting \n[2, 4, 6] are walking")
Task Type: 4
where
[1, 3, 5] are sitting 
[2, 4, 6] are walking
In [15]:
# One subplot per sensor, each showing its three axes over the session time
plot_rows = [
    ["acc_x", "acc_y", "acc_z"],
    ["gyr_x", "gyr_y", "gyr_z"],
    ["mag_x", "mag_y", "mag_z"],
]

f, axes = plt.subplots(3, 1, dpi=180, figsize=(5.473, 2))
for i, row in enumerate(plot_rows):
    # DataFrame.plot documents `sharex` as a bool; "col" is a plt.subplots
    # option and only worked here because it is truthy. With an explicit `ax`,
    # sharex=True makes pandas hide the x labels of all but the bottom subplot.
    g = df_single_session[row].plot(linewidth=0.3, ax=axes[i], sharex=True)
    # Anchor the legend just outside the right edge of its subplot
    g.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.0)

utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-sample.png")

Overall Sensor Data Distribution

In [16]:
# Histograms (log-scaled counts) of every sensor column over the full dataset
df_temp = hmog.all[FEATURE_COLS]
# Mathtext titles, e.g. "acc_x" -> "$Acc_x$"
df_temp.columns = [f"${name.capitalize()}$" for name in df_temp.columns]

f, axes = plt.subplots(3, 3, sharex="col", sharey="col", dpi=180, figsize=(5.473, 2))
f.subplots_adjust(hspace=0.6, wspace=0.2)
cmap = cm.get_cmap("tab10")  # reused by the combined pairplot cells further down

for i, col in enumerate(df_temp.columns):
    # Columns are ordered sensor by sensor, so the quotient selects the sensor
    # (grid column) and the remainder selects the axis x/y/z (grid row)
    plot_column, plot_row = divmod(i, 3)

    g = sns.distplot(
        df_temp[col],
        kde=False,
        ax=axes[plot_row][plot_column],
        color=cmap(plot_column),
        hist_kws={"alpha": 1},
    )
    g.set_title(col)
    g.set_yscale("log")
    g.axes.set_xlabel("")

utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-sensor-distribution.png")

Interpretation:

  • The distributions vary a lot from sensor to sensor
  • Accelerometer: Left-skewed y- and z-axes, probably pulled towards 10 by the gravity component. The x-axis is less skewed and platykurtic.
  • Gyroscope: Very smooth, probably close to normal. Only the x-axis is slightly left-skewed.
  • Magnetometer: Strong skewness in all axes. The z-axis has some massive outliers.

Distribution per Subjects

Sensor Data Distribution

In [17]:
# One large horizontal boxplot figure per sensor column, one box per subject
for fig_num, col in enumerate(FEATURE_COLS):
    plt.figure(fig_num, figsize=(20, 30))
    g = sns.boxplot(
        data=hmog.all,
        x=col,
        y="subject",
        orient="h",
        fliersize=2,
        color="tab:blue",
        saturation=1,
    )
    g.set_title(f'Distribution of "{col}" per Subject')
    g.axes.set_xlabel("")
    # Grid lines in both directions
    for axis in (g.axes.xaxis, g.axes.yaxis):
        axis.grid(True)
In [18]:
# Partial plot for use in thesis
subjects = [
    "256487",
    "257279",
    "261313",
    "264325",
    "277905",
    "278135",
    "326223",
    "336172",
    "342329",
    "352716",
]
df_temp = hmog.all[hmog.all["subject"].isin(subjects)].copy(deep=True)
df_temp["Subject"] = df_temp["subject"].cat.remove_unused_categories()
df_temp = df_temp.rename(columns={"mag_z": "$Mag_z$"})

plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.boxplot(
    y="Subject",
    x="$Mag_z$",
    data=df_temp,
    orient="h",
    **utils_boxplot_style
)
g.axes.xaxis.grid(True)
g.axes.yaxis.grid(True)

utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-mag-outlier.png")

Sessions per Subjects

In [19]:
# Add session number as column
hmog.all["session_number"] = hmog.all["session"].str.rsplit("_", n=1).str[-1].astype("int")
hmog.all.head()
Out[19]:
acc_x acc_y acc_z gyr_x gyr_y gyr_z mag_x mag_y mag_z sys_time subject session task_type session_number
0 0.240019 7.783550 5.087678 0.023729 -0.087669 -0.298940 -17.868000 15.582000 -30.400000 1396226206090 100669 100669_session_1 0 1
1 0.230074 7.788090 5.080470 0.024885 -0.085446 -0.296594 -17.855675 15.556771 -30.374313 1396226206100 100669 100669_session_1 0 1
2 0.220130 7.792631 5.073262 0.026041 -0.083223 -0.294248 -17.843349 15.531542 -30.348626 1396226206110 100669 100669_session_1 0 1
3 0.210185 7.797171 5.066054 0.027196 -0.080999 -0.291902 -17.831024 15.506313 -30.322939 1396226206120 100669 100669_session_1 0 1
4 0.200241 7.801712 5.058845 0.028352 -0.078776 -0.289556 -17.818699 15.481084 -30.297253 1396226206130 100669 100669_session_1 0 1
In [20]:
# Per subject: number of distinct sessions and number of samples (rows)
session_stats = hmog.all.groupby("subject").agg({"session": ["nunique", "count"]})
df_subjects = session_stats.reset_index()
df_subjects.columns = ["subject", "sessions", "samples"]
# Mean samples per session, converted to minutes: / 100 (Hz) -> seconds, / 60 -> minutes
df_subjects["mean_min_per_session"] = df_subjects["samples"] / df_subjects["sessions"] / 100 / 60
# Largest subjects first; the bar plots below reuse this order
df_subjects = df_subjects.sort_values("samples", ascending=False).reset_index(drop=True)
df_subjects.head(3)
Out[20]:
subject sessions samples mean_min_per_session
0 256487 24 2326559 16.156660
1 389015 24 2152787 14.949910
2 856401 24 2028491 14.086743
In [21]:
# Sessions per subject, bars in df_subjects order (most samples first).
# Red marks the three subjects that the task-type check further down lists with
# fewer than 4 sessions for some task type.
highlighted = ("733162", "796581", "526319")
clrs = ["tab:red" if x in highlighted else "tab:blue" for x in df_subjects["subject"]]

plt.figure(dpi=180, figsize=(8, 2))
sns.barplot(
    data=df_subjects,
    x="subject",
    y="sessions",
    order=df_subjects["subject"],
    palette=clrs,
    saturation=1,
)
plt.xticks(rotation=90, fontsize=4);

Samples per Subjects

In [22]:
# Samples per subject, bars in df_subjects order (most samples first).
# Red marks the three subjects that the task-type check further down lists with
# fewer than 4 sessions for some task type.
highlighted = ("733162", "796581", "526319")
clrs = ["tab:red" if x in highlighted else "tab:blue" for x in df_subjects["subject"]]

plt.figure(dpi=180, figsize=(8, 2))
sns.barplot(
    data=df_subjects,
    x="subject",
    y="samples",
    order=df_subjects["subject"],
    palette=clrs,
    saturation=1,
)
plt.xticks(rotation=90, fontsize=4);
In [23]:
# Partial plot for use in thesis
df_temp = df_subjects
df_temp = pd.concat([df_temp.head(14), df_temp.tail(18)])
df_temp["subject"] = df_temp["subject"].astype(str)
df_temp.loc[df_temp["subject"] == "771782", "samples"] = 0
df_temp.loc[df_temp["subject"] == "771782", "subject"] = "..."


plt.figure(dpi=180, figsize=(5.473, 2))
clrs = [
    "tab:red" if (x in ("733162", "796581", "526319")) else "tab:blue"
    for x in df_temp["subject"]
]

g = sns.barplot(
    x="subject",
    y="samples",
    data=df_temp,
    palette=clrs,
    order=df_temp["subject"],
    saturation=1,
)
g.set_ylabel("Samples")
g.set_xlabel("Subjects")
plt.xticks(rotation=90)

utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-samples-dist.pdf")

Task Types per Subjects

In [24]:
# Add session task type as dummies
# One indicator column per task type found in the session index
df_task_dummies = pd.get_dummies(hmog.index["task_type"])
dummy_columns = [f"taskid_{task_id}" for task_id in df_task_dummies.columns]
df_task_dummies.columns = dummy_columns
# Session index and indicator columns side by side (aligned on the row index)
df_task_types = pd.concat([hmog.index, df_task_dummies], axis=1)
In [25]:
# Sessions per subject and task type: sum of the indicator columns per subject
df_tasks = df_task_types.groupby("subject")[dummy_columns].sum()
In [26]:
# Mapping according to hmog-docu:
# NOTE(review): labels are assigned purely by position, so this relies on
# df_tasks having exactly six columns in ascending task-id order - presumably
# taskid_1 .. taskid_6; verify against hmog.index["task_type"].
df_tasks.columns = [
    "read + sit",
    "read + walk",
    "write + sit",
    "write + walk",
    "map + sit",
    "map + walk",
]
In [27]:
# Sessions per task type for the first subjects
df_tasks.head()
Out[27]:
read + sit read + walk write + sit write + walk map + sit map + walk
subject
100669 4 4 4 4 4 4
151985 4 4 4 4 4 4
171538 4 4 4 4 4 4
180679 4 4 4 4 4 4
186676 4 4 4 4 4 4
In [28]:
# Stacked bar chart: sessions per subject, split by task type
fig = plt.figure(dpi=180, figsize=(7, 1.4))
g = df_tasks.plot.bar(stacked=True, width=0.6, linewidth=0, ax=fig.gca())
# Legend in a single row below the plot
g.legend(loc="upper center", bbox_to_anchor=(0.5, -0.38), ncol=6, fontsize=5)
g.tick_params(axis="both", which="major", pad=0)
g.set_xlabel("Subjects", fontsize=6)
g.set_ylabel("Sessions", fontsize=6)
# y ticks in steps of 4 up to 24
plt.yticks(np.arange(0, 28, 4.0), fontsize=6)
plt.xticks(fontsize=4)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-tasks-dist.pdf")
In [29]:
# Subjects with fewer than 4 sessions for at least one task type: counts >= 4
# are masked to NaN, then rows without any remaining value are dropped
df_tasks[df_tasks < 4].dropna(axis=0, how="all")
Out[29]:
read + sit read + walk write + sit write + walk map + sit map + walk
subject
526319 NaN NaN NaN NaN NaN 3.0
733162 NaN NaN 3.0 3.0 3.0 1.0
796581 3.0 NaN NaN NaN NaN NaN

Interpretation: The three subjects above have missing sessions (< 4) for certain task types and should be excluded.

Distributions per Sessions

In [30]:
# Aggregate the raw samples to one row per (subject, session_number):
# mean and standard deviation of every sensor column, plus the median task
# type and the number of samples in the session.
aggs = ["mean", "std"]
sensor_aggs = {
    f"{sensor}_{axis}": aggs
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
}
# `subject` was cast to categorical earlier in the notebook. observed=True
# keeps only subject/session combinations that actually occur; without it,
# pandas versions where `observed` defaults to False may add empty rows for
# unobserved combinations, which would distort the per-session statistics.
df_session = hmog.all.groupby(["subject", "session_number"], observed=True).agg(
    {**sensor_aggs, "task_type": ["median", "count"]}
)
In [31]:
# Flatten the (column, aggregate) MultiIndex into "<column>_<aggregate>" names
df_session.columns = df_session.columns.map("_".join)
# The per-session row count doubles as the number of samples
df_session = df_session.rename(columns={"task_type_count": "value_count"}).reset_index()
# Samples / 100 (Hz) -> seconds, / 60 -> minutes
df_session["Session Duration in Minutes"] = df_session["value_count"] / 100 / 60
df_session["Samples per Session"] = df_session["value_count"]
In [32]:
# First rows of the per-session aggregates
df_session.head()
Out[32]:
subject session_number acc_x_mean acc_x_std acc_y_mean acc_y_std acc_z_mean acc_z_std gyr_x_mean gyr_x_std gyr_y_mean gyr_y_std gyr_z_mean gyr_z_std mag_x_mean mag_x_std mag_y_mean mag_y_std mag_z_mean mag_z_std task_type_median value_count Session Duration in Minutes Samples per Session
0 100669 1 -0.520849 0.939163 6.693916 0.847856 6.247475 0.724429 -0.002640 0.067646 -0.015322 0.057995 -0.025233 0.096079 -17.160741 2.112589 20.278229 2.930276 -29.528778 1.308334 1 78117 13.019500 78117
1 100669 2 -0.229571 0.407601 7.460647 0.619245 5.514775 0.820172 -0.002929 0.093826 -0.025059 0.099088 -0.023074 0.047261 -15.888098 1.365652 16.732562 2.442309 -26.565749 1.426067 3 109172 18.195333 109172
2 100669 3 -0.210573 0.452553 6.848979 1.466300 5.977902 1.195907 0.000237 0.087485 -0.023031 0.089389 -0.022711 0.044976 -16.135071 0.751890 18.551714 5.078402 -27.703363 1.806159 3 94743 15.790500 94743
3 100669 4 0.013836 0.688999 4.706553 0.729543 7.741479 0.534117 0.003221 0.093958 0.016462 0.205066 0.055296 0.253672 -19.743393 7.655460 11.751817 4.310025 -39.175828 4.875234 2 51969 8.661500 51969
4 100669 5 -0.038892 0.562523 4.622535 0.624875 7.971410 0.534906 0.008376 0.094628 -0.008233 0.209332 0.020011 0.247850 -2.400062 6.579658 -10.824765 9.607490 -42.493030 27.933139 6 37088 6.181333 37088
In [33]:
print(f"Histogramm of session duration, for all {len(df_session)} sessions")
durations = df_session["Session Duration in Minutes"]
mean_duration = durations.mean()

fig = plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.distplot(durations, kde=False, hist_kws={"alpha": 1, "lw": 0.5})
g.set_xlabel("Session Duration in Minutes", fontsize=6)

# Dashed vertical marker at the mean; its fixed height (220) and the text
# positions below are given in histogram-count units
plt.plot([mean_duration] * 2, [0, 220], linestyle="dashed", color=MAGENTA, lw=0.8)
for y_pos, label, size in ((180, "mean", 6), (165, f"({mean_duration:.1f} min)", 5)):
    plt.text(
        mean_duration + 0.5,
        y_pos,
        label,
        fontsize=size,
        color=MAGENTA,
        horizontalalignment="left",
    )

plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-duration.pdf")
Histogramm of session duration, for all 2392 sessions
In [34]:
print(f"Histogramm of session samples, for all {len(df_session)} sessions")
fig = plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.distplot(df_session["Samples per Session"], kde=False, hist_kws={"alpha": 1})
g.set_xlabel("Samples per Session", fontsize=6)

# Dashed marker at 24000 samples, i.e. 4 minutes at 100 Hz; the annotation is
# right-aligned to the left of the marker
plt.plot([24000, 24000], [0, 220], linestyle="dashed", color=MAGENTA, lw=0.8)
for y_pos, label, size in ((170, "24000", 6), (155, "(4 min)", 5)):
    plt.text(21000, y_pos, label, fontsize=size, color=MAGENTA, horizontalalignment="right")

plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-samples.pdf")
Histogramm of session samples, for all 2392 sessions
In [35]:
# The 20 shortest sessions
duration_col = "Session Duration in Minutes"
df_session[["subject", "session_number", duration_col]].sort_values(duration_col).head(20)
Out[35]:
subject session_number Session Duration in Minutes
215 219303 24 0.846167
1340 717868 22 0.912500
213 219303 22 0.970000
907 539502 21 1.166667
208 219303 17 1.497833
2340 986737 21 1.607000
906 539502 20 1.623500
989 556357 7 1.644667
203 219303 12 1.779833
1384 733162 24 1.835000
1441 745224 9 1.835333
901 539502 15 1.958333
1614 785899 14 2.006500
2324 986737 5 2.027667
984 556357 2 2.046833
1661 799296 14 2.055667
1647 796581 23 2.145500
900 539502 14 2.283000
545 366286 18 2.287667
1418 737973 10 2.353333
In [36]:
# Session duration per session number, one facet per subject, bars coloured by
# the session's (median) task type
g = sns.FacetGrid(df_session, col="subject", col_wrap=7, hue="task_type_median")
# Keep `g` bound to the grid: the original chained `.fig.subplots_adjust(...)`
# onto the assignment, which rebinds `g` to the None that call returns.
g = g.map(plt.bar, "session_number", "Session Duration in Minutes")
# Without a legend the hue colours cannot be related to task types
g.add_legend()
g.fig.subplots_adjust(wspace=0.15, hspace=0.15);

Correlations

In [37]:
# Per-session aggregates used for the pair plots below
# (same view as shown after the aggregation above)
df_session.head()
Out[37]:
subject session_number acc_x_mean acc_x_std acc_y_mean acc_y_std acc_z_mean acc_z_std gyr_x_mean gyr_x_std gyr_y_mean gyr_y_std gyr_z_mean gyr_z_std mag_x_mean mag_x_std mag_y_mean mag_y_std mag_z_mean mag_z_std task_type_median value_count Session Duration in Minutes Samples per Session
0 100669 1 -0.520849 0.939163 6.693916 0.847856 6.247475 0.724429 -0.002640 0.067646 -0.015322 0.057995 -0.025233 0.096079 -17.160741 2.112589 20.278229 2.930276 -29.528778 1.308334 1 78117 13.019500 78117
1 100669 2 -0.229571 0.407601 7.460647 0.619245 5.514775 0.820172 -0.002929 0.093826 -0.025059 0.099088 -0.023074 0.047261 -15.888098 1.365652 16.732562 2.442309 -26.565749 1.426067 3 109172 18.195333 109172
2 100669 3 -0.210573 0.452553 6.848979 1.466300 5.977902 1.195907 0.000237 0.087485 -0.023031 0.089389 -0.022711 0.044976 -16.135071 0.751890 18.551714 5.078402 -27.703363 1.806159 3 94743 15.790500 94743
3 100669 4 0.013836 0.688999 4.706553 0.729543 7.741479 0.534117 0.003221 0.093958 0.016462 0.205066 0.055296 0.253672 -19.743393 7.655460 11.751817 4.310025 -39.175828 4.875234 2 51969 8.661500 51969
4 100669 5 -0.038892 0.562523 4.622535 0.624875 7.971410 0.534906 0.008376 0.094628 -0.008233 0.209332 0.020011 0.247850 -2.400062 6.579658 -10.824765 9.607490 -42.493030 27.933139 6 37088 6.181333 37088

Sensor Data by Scenario

In [38]:
# Pairwise scatter plots of the per-session sensor means, coloured by scenario
mean_cols = [
    f"{sensor}_{axis}_mean"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]
# Copy the needed columns directly. The original reset the index twice (once
# with the string "True" as `drop`) only to drop the resulting "index" column
# again; df_session already carries a plain index from its own reset_index().
df_temp = df_session[mean_cols + ["subject", "task_type_median"]].copy()
# Even task ids are the walking tasks, all others count as sitting
df_temp["scenario"] = np.where(
    df_temp["task_type_median"].isin([2, 4, 6]), "walk", "sit"
)
df_temp["subject"] = df_temp["subject"].astype(str) + " ."  # Workaround bug in seaborn
df_temp = df_temp.drop(columns=["task_type_median"])

# pairplot creates its own figure, so no figure is opened beforehand: the
# original plt.figure(...) call only produced an empty figure
# ("<Figure ... with 0 Axes>" in the cell output).
sns.pairplot(df_temp, hue="scenario", palette="tab10", plot_kws={"s": 15})
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-sit-walk.pdf")
<Figure size 985.14x985.14 with 0 Axes>

Sensor Data by Subject

In [39]:
# Pairwise scatter plots of the per-session sensor means for three randomly
# drawn subjects, coloured by subject
mean_cols = [
    f"{sensor}_{axis}_mean"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]
# Copy the needed columns directly; the original reset the index twice only to
# drop the resulting "index" column again.
df_temp = df_session[mean_cols + ["subject"]].copy()
df_temp["subject"] = df_temp["subject"].astype(str) + " ."  # Workaround bug in seaborn
np.random.seed(SEED)
# NOTE: np.random.choice draws with replacement by default, so fewer than three
# distinct subjects are possible. The call is kept as is so the seeded
# selection (and the thesis figure) does not change.
random_subjects = np.random.choice(df_temp["subject"].unique(), size=3)
df_temp = df_temp[df_temp["subject"].isin(list(random_subjects))]

# pairplot creates its own figure, so no figure is opened beforehand: the
# original plt.figure(...) call only produced an empty figure
# ("<Figure ... with 0 Axes>" in the cell output).
g = sns.pairplot(df_temp, hue="subject", palette="tab10", plot_kws={"s": 30})
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-three-subjects.pdf")
<Figure size 985.14x985.14 with 0 Axes>

Combined partial plot for use in thesis

In [40]:
# Prepare Data
# Per-session accelerometer means plus subject and task type
df_temp = df_session[
    ["acc_x_mean", "acc_y_mean", "acc_z_mean", "task_type_median", "subject"]
].copy()

# Even task ids are the walking tasks, all others count as sitting
is_walking = df_temp["task_type_median"].isin([2, 4, 6])
df_temp["Scenario"] = np.where(is_walking, "walk", "sit")

# Drop the helper column and switch to display names (mathtext) for the plot
display_names = {
    "acc_x_mean": "$mean(Acc_x)$",
    "acc_y_mean": "$mean(Acc_y)$",
    "acc_z_mean": "$mean(Acc_z)$",
    "subject": "Subject",
}
df_temp = df_temp.drop(columns=["task_type_median"]).rename(columns=display_names)
df_temp["Subject"] = df_temp["Subject"].astype(str) + " ."  # Workaround bug in seaborn

# Seeded draw of three subjects (np.random.choice samples with replacement by
# default); only their sessions are kept
np.random.seed(SEED)
random_subjects = np.random.choice(df_temp["Subject"].unique(), size=3)
df_temp = df_temp[df_temp["Subject"].isin(list(random_subjects))]
In [41]:
# Plot separately
plt.ioff()
g1 = sns.pairplot(df_temp, hue="Subject", palette=cmap.colors, plot_kws={"s": 15})
g1.fig.set_size_inches(6.5, 5)
g2 = sns.pairplot(df_temp, hue="Scenario", palette=cmap.colors[6:], plot_kws={"s": 15})
for ax in g2.axes.flatten():
    ax.set_ylabel("")
g2.fig.set_size_inches(13, 5)
g2.fig.subplots_adjust(right=0.805)
g2.fig.subplots_adjust(left=0.455)
In [42]:
# Combine both plots
f = plt.figure(figsize=(14.3, 5))
for g in [g1, g2]:
    g.fig.subplots_adjust(top=0.99, bottom=0.15)
    for ax in g.fig.axes:
        f._axstack.add(f._make_key(ax), ax)

custom_lines = [
    Line2D([0], [0], color=cmap(0), marker="o", lw=0),
    Line2D([0], [0], color=cmap(1), marker="o", lw=0),
    Line2D([0], [0], color=cmap(2), marker="o", lw=0),
]
f.legend(
    custom_lines,
    ["588087", "698266", "893255"],
    title="Subjects",
    loc="upper right",
    handlelength=0.2,
    bbox_to_anchor=(0.312, 0.5, 0.5, 0.5),
)
custom_lines = [
    Line2D([0], [0], color=cmap(6), marker="o", lw=0),
    Line2D([0], [0], color=cmap(7), marker="o", lw=0),
]
f.legend(
    custom_lines,
    ["Walking ", "Sitting"],
    title="Scenarios",
    loc="upper right",
    handlelength=0.2,
    bbox_to_anchor=(0.319, 0.2, 0.5, 0.5),
)

utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-means.pdf")
In [ ]: